This is a report for investigating and making analysis of the movie database (TMDb) which has 10865 movies entries and 21 columns, and I will try to answer some of the following exploratory data analysis:
# http://ipython.readthedocs.io/en/stable/interactive/magics.html
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
% matplotlib inline
# Load your data and print out a few lines. Perform operations to inspect data
df_tmdb = pd.read_csv('tmdb-movies.csv')
df_tmdb.head()
# types and number or rows and cols
df_tmdb.info()
def print_rows_cols(df):
print('tmdb number of movies entries {} movies and {} columns'.format(df.shape[0]-1, df.shape[1]))
print_rows_cols(df_tmdb)
# count null in all columns to fill it later in data cleaning
def print_na_cols():
for col in df_tmdb.columns:
count_na = sum(df_tmdb[col].isna())
if count_na > 0:
print(col, ':', count_na)
print_na_cols()
# look for instances of missing or possibly errant data.
duplicated = df_tmdb.duplicated()
df_tmdb[duplicated]
# number of unique movies
df_tmdb.nunique()
Note: One id is duplicated I will drop later in cleaning
# some describtion values to represent the data
df_tmdb.describe()
df_tmdb.hist(figsize=(15,15), bins=20);
sns.pairplot(df_tmdb, size=20);
All columns names is lowercase and seperated by _ so I won't change it
1- There is one duplicated row and it will be removed
2- Columns that are not unused in my analysis and will be dropped:
- imdb_id
- keywords
- homepage
- overview
- release_date
- tagline
3- I have to handle columns that have invalid values like money or time = 0
- budget
- budget_adj
- revenue
- revenue_adj
- runtime
4- Columns that have null rows that have to be filled or dropped:
- imdb_id : 10 (will drop this column anyway)
- cast : 76
- homepage : 7930 (will drop this column anyway)
- director : 44
- tagline : 2824 (will drop this column anyway)
- keywords : 1493
- overview : 4 (will drop this column anyway)
- genres : 23
- production_companies : 1030
5- Columns that have values which have more than one seperated by '|' character, it can be handled for better analysis:
- cast
- director
- keywords
- genres
- production_companies
6- insert column which represent the movie profit with values equal revenue minus budget
# 1. remove duplicate rows
df_tmdb.drop_duplicates(inplace=True)
print_rows_cols(df_tmdb) # check duplicate row removed successfully
# 2. drop unused cols in the analysis
# I need the id as there is different movies with the same name
cols_drop_list = [
'imdb_id',
'keywords',
'homepage',
'overview',
'release_date',
'tagline'
]
df_tmdb = df_tmdb.drop(cols_drop_list, axis=1)
df_tmdb.head(2)
# 3. handle invalid zero value
cols_zero_to_nan = ['budget', 'budget_adj', 'revenue', 'revenue_adj', 'runtime']
df_tmdb[cols_zero_to_nan] = df_tmdb[cols_zero_to_nan].replace(0, np.NAN)
df_tmdb.describe()
print_na_cols()
print_rows_cols(df_tmdb)
# 4. drop rows with nan values
df_tmdb.dropna(subset=cols_zero_to_nan, inplace=True)
print_rows_cols(df_tmdb)
df_tmdb.info()
df_tmdb.head()
Note I lost alot of data by dropping the rows which has null value but still I have enough to make my analysis I could use the mean to fill them but I choose to drop them
split_list = ['cast', 'director', 'genres', 'production_companies']
df_dict = {}
for col in split_list:
df_dict[col] = df_tmdb[col].str.split('|', expand=True).rename(columns=lambda x: f"{col}_{x+1}")
df_dict[split_list[0]].head(2)
df_dict[split_list[1]].tail(2)
df_dict[split_list[2]].head(2)
df_dict[split_list[3]].head(2)
# drop split_list cols
df_tmdb.drop(split_list, axis=1, inplace=True)
# Join new cols
df_tmdb = df_tmdb.join(df_dict.values())
# check dataframe info
df_tmdb.info()
Note there is now a null values but it is only in splited cols it doesn't matter
# 6. insert profit column
df_tmdb.insert(loc=4, column='profit', value=df_tmdb['revenue'] - df_tmdb['budget'])
df_tmdb.head()
# consider movies entries only with more than 500 votes count
df_votes = df_tmdb[df_tmdb['vote_count'] > 500].loc[:, 'vote_count' : 'vote_average']
df_votes.head(2)
df_votes.plot(x='vote_count', y='vote_average', kind='scatter', figsize=(8,8));
df_votes.corr()
# consider movies entries only with more than 500 votes count
df_rating_profits = df_tmdb[df_tmdb['vote_count'] > 500][['profit', 'vote_average']]
df_rating_profits.head(2)
df_rating_profits.plot(x='profit', y='vote_average', kind='scatter', figsize=(8,8));
df_rating_profits.corr()
# get different genres unique values
genres_cols = [f"genres_{i}" for i in range(1,6)]
genres_list = []
for x in genres_cols:
genres_list += [y for y in list(df_tmdb[x].unique()) if y != None]
genres_list = list(set(genres_list))
print('Genres:', genres_list)
def get_empty_series():
sr = pd.Series([0.0 for _ in range(len(genres_list))])
sr.index = genres_list
return sr
total_profits_genres = get_empty_series()
count_movies_genres = get_empty_series()
tmp_series = get_empty_series()
for i in range(len(genres_cols)):
# calc total profit
tmp_series = get_empty_series()
tmp_series += df_tmdb.groupby(genres_cols[i])['profit'].sum()
tmp_series.fillna(0, inplace=True)
total_profits_genres += tmp_series
# calc avg profit
tmp_series = get_empty_series()
tmp_series += df_tmdb[genres_cols[i]].value_counts()
tmp_series.fillna(0, inplace=True)
count_movies_genres += tmp_series
avg_profits_genres = total_profits_genres / count_movies_genres
print('Total profit for each genres:')
total_profits_genres.sort_values(inplace=True)
print(total_profits_genres)
total_profits_genres.plot.pie(figsize=(15,15));
print('Number of movies from each genres:')
count_movies_genres.sort_values(inplace=True)
print(count_movies_genres)
print(count_movies_genres.index)
count_movies_genres.plot.pie(figsize=(15,15));
print('Avg profit for each genres:')
avg_profits_genres.sort_values(inplace=True)
print(avg_profits_genres)
avg_profits_genres.plot.pie(figsize=(15,15));
df_year = df_tmdb[['release_year']]
df_year_movies = df_year['release_year'].value_counts()
df_year_movies = df_year_movies.to_frame().sort_values('release_year').reset_index()
df_year_movies.columns = ['release_year', 'movies_count']
df_year_movies
df_year_movies.corr()
df_year_movies.plot(
x='release_year',
y='movies_count',
kind='line',
figsize=(15,10),
title="Number of movies over the years"
)
plt.xlabel('Release Year')
plt.ylabel('Number Of Movies');
# consider movies entries only with more than 500 votes count
df_rating_profits = df_tmdb[['profit', 'budget', 'runtime']]
df_rating_profits.head(2)
df_rating_profits[['budget', 'runtime']].plot(x='budget', y='runtime', kind='scatter', figsize=(8,8));
df_rating_profits[['profit', 'runtime']].plot(x='profit', y='runtime', kind='scatter', figsize=(8,8));
df_rating_profits.corr()
Answer1: The corralation does not indicate that higher the vote count the higher vote average.
Answer2: The corralation does not indicate that higher the vote average leads to higher profit (very low correclation = 0.1).
Answer3: I calculated the number of movies for each genres and also the total profit and average profit for each genres and it turns out the Foreign genre is the lowest average profit and Animation is the highest
Answer4: The Number of movies increased over the years from 1960 to 2015 as release year and number of movies are highly correlated (0.9).
Answer5: The corralation does not indicate that higher runtime of the movie the higher the budget or profit the correlation is low (0.2)
from subprocess import call
call(['python', '-m', 'nbconvert', 'Investigate_a_Dataset.ipynb'])